Raw data

# Read the Mailchimp users data from data/mailchimp_users.rds and print a
# column-by-column overview.
mailchimp_users_tbl <- here("data", "mailchimp_users.rds") %>% read_rds()
glimpse(mailchimp_users_tbl)
## Rows: 23,672
## Columns: 10
## $ euid          <chr> "000b09860b", "000d76e3f8", "0010ca98c6", "00129d76f9", …
## $ leid          <chr> "308851471", "71513779", "62999259", "68149639", "708797…
## $ member_rating <int> 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ optin_time    <date> 2019-10-16, 2019-05-22, 2018-11-19, 2019-02-27, 2019-05…
## $ confirm_time  <date> 2019-10-16, 2019-05-22, 2018-11-19, 2019-02-27, 2019-05…
## $ country_code  <chr> "us", "in", "lu", "us", NA, NA, NA, "in", NA, "ve", "mx"…
## $ region        <chr> "fl", "tn", "lu", "ca", NA, NA, NA, "as", NA, "a", "jal"…
## $ last_changed  <date> 2019-10-21, 2020-01-09, 2019-10-21, 2020-01-09, 2020-01…
## $ notes         <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ date_added    <date> 2020-03-02, 2020-03-02, 2020-03-02, 2020-03-02, 2020-03…
# Daily summary: count the number of opt-in events on each day.
# Only days that have at least one opt-in appear in the result.
optins_day_tbl <- summarise_by_time(
    mailchimp_users_tbl,
    .date_var = optin_time,
    .by       = "day",
    optins    = n()
)

head(optins_day_tbl)
## # A tibble: 6 × 2
##   optin_time optins
##   <date>      <int>
## 1 2018-06-08      1
## 2 2018-07-03     10
## 3 2018-07-04     15
## 4 2018-07-05      9
## 5 2018-07-06     11
## 6 2018-07-07      4
# Weekly summary: the same opt-in count, aggregated by week.
head(
    summarise_by_time(
        mailchimp_users_tbl,
        .date_var = optin_time,
        .by       = "week",
        optins    = n()
    )
)
## # A tibble: 6 × 2
##   optin_time optins
##   <date>      <int>
## 1 2018-06-03      1
## 2 2018-07-01     49
## 3 2018-07-08     73
## 4 2018-07-15     62
## 5 2018-07-22     51
## 6 2018-07-29     52
# Monthly summary: the same opt-in count, aggregated by month.
head(
    summarise_by_time(
        mailchimp_users_tbl,
        .date_var = optin_time,
        .by       = "month",
        optins    = n()
    )
)
## # A tibble: 6 × 2
##   optin_time optins
##   <date>      <int>
## 1 2018-06-01      1
## 2 2018-07-01    254
## 3 2018-08-01    210
## 4 2018-09-01    236
## 5 2018-10-01    267
## 6 2018-11-01   3955

tk_summary_diagnostics()

  • index: the date or date-time column is called the index

  • units: the description of a single timestamp within a time series

  • scale: the most common difference between timestamps within a time series. It is also called interval, frequency, period or periodicity

  • Differences Summary

    • Characterizes the scale (interval) between time stamps in seconds
    • 1 day = 86,400 seconds
  • optins_day_tbl is an irregular time series. We need to fill in the gaps (i.e., missing days) before conducting the analysis.

# Check the spacing of the daily index. Look at diff.mean in the result:
# a regular daily series has a mean gap of exactly 86400 seconds.
tk_summary_diagnostics(optins_day_tbl, .date_var = optin_time)
## # A tibble: 1 × 12
##   n.obs start      end        units scale tzone diff.minimum diff.q1 diff.median
##   <int> <date>     <date>     <chr> <chr> <chr>        <dbl>   <dbl>       <dbl>
## 1   608 2018-06-08 2020-03-02 days  day   UTC          86400   86400       86400
## # ℹ 3 more variables: diff.mean <dbl>, diff.q3 <dbl>, diff.maximum <dbl>

pad_by_time()

  • Performs time-series padding filling in any gaps, to convert the time series to a regular time series.

Cleaning Data

optins_day_prepared_tbl

# Preview the padded series: missing days are inserted, and because no
# .pad_value is given their optins are NA.
head(
    pad_by_time(optins_day_tbl, .date_var = optin_time)
)
## pad applied on the interval: day
## # A tibble: 6 × 2
##   optin_time optins
##   <date>      <int>
## 1 2018-06-08      1
## 2 2018-06-09     NA
## 3 2018-06-10     NA
## 4 2018-06-11     NA
## 5 2018-06-12     NA
## 6 2018-06-13     NA
# Regular daily series of opt-ins: count opt-ins per day, then pad every
# missing day with 0 opt-ins. .date_var is left unset for pad_by_time(),
# so the date column is detected automatically (a message reports it).
subscribers_daily_tbl <- pad_by_time(
    summarise_by_time(
        mailchimp_users_tbl,
        .date_var = optin_time,
        .by       = "day",
        optins    = n()
    ),
    .by        = "day",
    .pad_value = 0
)
## .date_var is missing. Using: optin_time
# First rows of the padded daily series
head(subscribers_daily_tbl)
## # A tibble: 6 × 2
##   optin_time optins
##   <date>      <int>
## 1 2018-06-08      1
## 2 2018-06-09      0
## 3 2018-06-10      0
## 4 2018-06-11      0
## 5 2018-06-12      0
## 6 2018-06-13      0
# Re-run the index diagnostics on the padded series and look at diff.mean
# again, now that the missing days have been filled in.
tk_summary_diagnostics(subscribers_daily_tbl, .date_var = optin_time)
## # A tibble: 1 × 12
##   n.obs start      end        units scale tzone diff.minimum diff.q1 diff.median
##   <int> <date>     <date>     <chr> <chr> <chr>        <dbl>   <dbl>       <dbl>
## 1   634 2018-06-08 2020-03-02 days  day   UTC          86400   86400       86400
## # ℹ 3 more variables: diff.mean <dbl>, diff.q3 <dbl>, diff.maximum <dbl>

plot_time_series()

# Plot the daily opt-in counts over time.
plot_time_series(
    subscribers_daily_tbl,
    .date_var = optin_time,
    .value    = optins
)

# Anomaly diagnostics on the same series, with .alpha = 0.01.
plot_anomaly_diagnostics(
    subscribers_daily_tbl,
    .date_var = optin_time,
    .value    = optins,
    .alpha    = 0.01
)
## frequency = 7 observations per 1 week
## trend = 92 observations per 3 months
  • Perform log transformation
# Plot the log-transformed counts; the +1 keeps zero-count days finite.
plot_time_series(subscribers_daily_tbl, optin_time, log(optins + 1))

ACF and PACF

  • Transformation (i.e., log1p) is absolutely critical in identifying lags and using lags in models.
    • Without transformation the ACF plot shows almost no correlation.
  • With transformation, we can see which lags are potential features.
    • The plot below shows local maxima recurring at lags that are multiples of 7
# ACF/PACF on the raw counts, limited to the first 100 lags.
plot_acf_diagnostics(
    subscribers_daily_tbl,
    optin_time,
    optins,
    .lags = 100
)

# ACF/PACF on the log-transformed counts; .lags is left at its default.
plot_acf_diagnostics(subscribers_daily_tbl, optin_time, log(optins + 1))
## Max lag exceeds data available. Using max lag: 633
# STL decomposition of the raw counts: returns observed, season, trend,
# remainder and seasadj columns for every date.
tk_stl_diagnostics(
    subscribers_daily_tbl,
    .date_var = optin_time,
    .value    = optins
)
## frequency = 7 observations per 1 week
## trend = 92 observations per 3 months
## # A tibble: 634 × 6
##    optin_time observed season trend remainder seasadj
##    <date>        <dbl>  <dbl> <dbl>     <dbl>   <dbl>
##  1 2018-06-08        1 -0.121 0.248     0.873   1.12 
##  2 2018-06-09        0 -5.10  0.415     4.68    5.10 
##  3 2018-06-10        0 -3.32  0.583     2.74    3.32 
##  4 2018-06-11        0  2.40  0.750    -3.15   -2.40 
##  5 2018-06-12        0  2.68  0.918    -3.59   -2.68 
##  6 2018-06-13        0  3.08  1.09     -4.16   -3.08 
##  7 2018-06-14        0  0.390 1.25     -1.64   -0.390
##  8 2018-06-15        0 -0.121 1.42     -1.30    0.121
##  9 2018-06-16        0 -5.10  1.59      3.51    5.10 
## 10 2018-06-17        0 -3.32  1.76      1.57    3.32 
## # ℹ 624 more rows
  • Why do we need to transform the data?
    • Without transformation, you could be predicting negative values.
# Linear regression of the raw opt-in counts on a numeric time trend plus
# weekday and month factors; .show_summary = TRUE prints the lm() summary.
plot_time_series_regression(
    subscribers_daily_tbl,
    .date_var = optin_time,
    .formula  = optins ~ as.numeric(optin_time) +
        wday(optin_time, label = TRUE) +
        month(optin_time, label = TRUE),
    .show_summary = TRUE
)
## 
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -108.2  -29.5   -7.2    5.6 3210.6 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                        -880.48792  590.84981  -1.490   0.1367  
## as.numeric(optin_time)                0.05094    0.03281   1.553   0.1211  
## wday(optin_time, label = TRUE).L    -17.56395   14.70202  -1.195   0.2327  
## wday(optin_time, label = TRUE).Q    -32.03078   14.73770  -2.173   0.0301 *
## wday(optin_time, label = TRUE).C     14.23163   14.71486   0.967   0.3338  
## wday(optin_time, label = TRUE)^4    -10.93010   14.69951  -0.744   0.4574  
## wday(optin_time, label = TRUE)^5     24.05748   14.72539   1.634   0.1028  
## wday(optin_time, label = TRUE)^6    -15.50517   14.75169  -1.051   0.2936  
## month(optin_time, label = TRUE).L     3.47389   19.23763   0.181   0.8568  
## month(optin_time, label = TRUE).Q    16.88664   19.78001   0.854   0.3936  
## month(optin_time, label = TRUE).C    -3.57288   20.14312  -0.177   0.8593  
## month(optin_time, label = TRUE)^4   -31.83980   19.44091  -1.638   0.1020  
## month(optin_time, label = TRUE)^5    -9.52191   19.75561  -0.482   0.6300  
## month(optin_time, label = TRUE)^6   -38.58033   19.65726  -1.963   0.0501 .
## month(optin_time, label = TRUE)^7   -12.22472   20.44351  -0.598   0.5501  
## month(optin_time, label = TRUE)^8   -19.82784   20.91266  -0.948   0.3434  
## month(optin_time, label = TRUE)^9     2.47483   21.09561   0.117   0.9066  
## month(optin_time, label = TRUE)^10   -2.43759   21.57412  -0.113   0.9101  
## month(optin_time, label = TRUE)^11   -4.83019   19.92557  -0.242   0.8085  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 140 on 615 degrees of freedom
## Multiple R-squared:  0.04065,    Adjusted R-squared:  0.01257 
## F-statistic: 1.448 on 18 and 615 DF,  p-value: 0.1032

log transformation

# The following code will return an error (days with zero opt-ins give
# log(0) = -Inf):
# subscribers_daily_tbl %>%
#     plot_time_series(optin_time, log(optins))

# Log plus 1: log1p(x) is log(x + 1), which is defined for zero counts
plot_time_series(subscribers_daily_tbl, optin_time, log1p(optins))

# Inversion: expm1() undoes log1p(), recovering the original scale
plot_time_series(subscribers_daily_tbl, optin_time, expm1(log1p(optins)))

# Benefit: the same regression as before, now on the log1p-transformed target
plot_time_series_regression(
    subscribers_daily_tbl,
    .date_var = optin_time,
    .formula  = log1p(optins) ~ as.numeric(optin_time) +
        wday(optin_time, label = TRUE) +
        month(optin_time, label = TRUE),
    .show_summary = TRUE
)
## 
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6906 -0.4537 -0.0482  0.4072  5.3262 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        -5.044e+01  3.364e+00 -14.994  < 2e-16 ***
## as.numeric(optin_time)              2.965e-03  1.868e-04  15.871  < 2e-16 ***
## wday(optin_time, label = TRUE).L   -1.664e-01  8.371e-02  -1.988 0.047209 *  
## wday(optin_time, label = TRUE).Q   -7.487e-01  8.391e-02  -8.922  < 2e-16 ***
## wday(optin_time, label = TRUE).C    3.820e-02  8.378e-02   0.456 0.648621    
## wday(optin_time, label = TRUE)^4   -1.451e-02  8.369e-02  -0.173 0.862376    
## wday(optin_time, label = TRUE)^5    8.348e-02  8.384e-02   0.996 0.319785    
## wday(optin_time, label = TRUE)^6   -1.178e-01  8.399e-02  -1.403 0.161245    
## month(optin_time, label = TRUE).L  -3.101e-01  1.095e-01  -2.832 0.004784 ** 
## month(optin_time, label = TRUE).Q   4.351e-01  1.126e-01   3.863 0.000124 ***
## month(optin_time, label = TRUE).C   1.380e-01  1.147e-01   1.203 0.229248    
## month(optin_time, label = TRUE)^4  -5.089e-01  1.107e-01  -4.598 5.19e-06 ***
## month(optin_time, label = TRUE)^5   2.928e-01  1.125e-01   2.603 0.009451 ** 
## month(optin_time, label = TRUE)^6   6.176e-03  1.119e-01   0.055 0.956014    
## month(optin_time, label = TRUE)^7  -3.219e-01  1.164e-01  -2.765 0.005860 ** 
## month(optin_time, label = TRUE)^8  -2.172e-01  1.191e-01  -1.824 0.068574 .  
## month(optin_time, label = TRUE)^9   2.509e-01  1.201e-01   2.089 0.037101 *  
## month(optin_time, label = TRUE)^10  5.022e-02  1.228e-01   0.409 0.682831    
## month(optin_time, label = TRUE)^11 -4.688e-01  1.135e-01  -4.132 4.10e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7971 on 615 degrees of freedom
## Multiple R-squared:  0.4693, Adjusted R-squared:  0.4538 
## F-statistic: 30.22 on 18 and 615 DF,  p-value: < 2.2e-16

Feature Engineering

transformation

  • need to capture the parameters used for transformation
# Build the modeling target `optins_trans` from the padded daily counts:
# transform, trim the start of the series, and replace one outlier window
# with cleaned values. The result keeps optin_time and optins_trans only.
data_prepared_tbl <- subscribers_daily_tbl %>%
    # Preprocessing: log-interval transform, then standardize.
    # limit_upper is not supplied; the function reports the value it used,
    # together with the standardization mean / standard deviation. These
    # printed parameters are needed later to invert the transformation.
    mutate(optins_trans = log_interval_vec(optins, limit_lower = 0, offset = 1)) %>%
    mutate(optins_trans = standardize_vec(optins_trans)) %>%
    
    # Drop the start of the series: keep rows from 2018-07-03 onward
    # (before that date the padded series is a single opt-in followed by
    # zero-filled days). Note this happens AFTER the transformation, so the
    # transformation parameters are computed on the full series.
    filter_by_time(.start_date = "2018-07-03") %>%
    
    # Cleaning: compute a cleaned copy of the series with ts_clean_vec()
    # (period = 7), then substitute the cleaned values only for the dates
    # 2018-11-18 through 2018-11-20 (the outlier window). All other dates
    # keep their original transformed values.
    mutate(optins_trans_cleaned = ts_clean_vec(optins_trans, period = 7)) %>%
    mutate(optins_trans = ifelse(optin_time %>% between_time("2018-11-18", "2018-11-20"), 
                                 optins_trans_cleaned,
                                 optins_trans)) %>%
    
    # Remove the raw counts and the temporary cleaned column
    select(-optins, -optins_trans_cleaned)
## log_interval_vec(): 
##  Using limit_lower: 0
##  Using limit_upper: 3650.8
##  Using offset: 1
## Standardization Parameters
## mean: -5.25529020756467
## standard deviation: 1.1109817111334
## .date_var is missing. Using: optin_time
# Reshape every column whose name contains "trans" to long format
# (name / value) and plot it, colored by column name.
plot_time_series(
    pivot_longer(data_prepared_tbl, contains("trans")),
    optin_time,
    value,
    name
)
# ---- Save key parameters ----
# Values copied from the messages printed by log_interval_vec() and
# standardize_vec(). They are required to convert transformed values back
# to the original scale.

# Standardization parameters
std_mean    <- -5.25529020756467
std_sd      <- 1.1109817111334

# Log-interval transformation parameters
limit_lower <- 0
limit_upper <- 3650.8
offset      <- 1

data_prepared_full_tbl

h vs new_data

  1. Extend to Future Window
  • It is important to know your forecast horizon upfront.
  • This affects your ability to make features & how far to extend your full dataset
  2. Add any lags to full dataset
# Prediction horizon: 8 weeks x 7 days per week
horizon <- 8 * 7

# Feature engineering was critical to success in the M5 Competition, and
# engineered lag / rolling-average features are something the M5 winner used.
# Lag length: 8 weeks x 7 days per week (same length as the horizon).
lag_period <- 8 * 7

# Window sizes used to create the rolling averages
rolling_periods <- c(30, 60, 90)

Create Xreg

  • We can create additional features similar to trend() in TSLM.
# Extend the prepared series into the forecast window and add lag and
# rolling-mean features computed over the combined (history + future) rows.
data_prepared_full_tbl <- data_prepared_tbl %>%
    
    # Add future window: append `horizon` future dates after the last
    # observed date. `.` is the piped-in data_prepared_tbl; the appended
    # rows have NA for optins_trans.
    bind_rows(
        future_frame(.data = ., .date_var = optin_time, .length_out = horizon)
    ) %>%
    
    # Add autocorrelated lags: creates optins_trans_lag<lag_period>.
    # lag_period equals horizon, so the lag column is populated for every
    # appended future row.
    tk_augment_lags(optins_trans, .lags = lag_period) %>% 
    
    # Add rolling features: mean of the lag column over each window in
    # rolling_periods (columns optins_trans_lag56_roll_<period>).
    # NOTE: the column name optins_trans_lag56 is hard-coded and must be
    # updated if lag_period changes from 56.
    # NOTE(review): .align = "center" / .partial = TRUE presumably let
    # windows at the series edges be computed from fewer points; confirm
    # against the slidify documentation.
    tk_augment_slidify(
        .value   = optins_trans_lag56,
        .f       = mean, 
        .period  = rolling_periods,
        .align   = "center",
        .partial = TRUE
    ) 

# Plot every column except the date, one series per column name,
# with the smoother turned off.
plot_time_series(
    pivot_longer(data_prepared_full_tbl, -optin_time),
    .date_var = optin_time,
    value,
    name,
    .smooth = FALSE
)

# First rows as a styled HTML table: lag and rolling features are NA here.
kable_styling(
    kable(head(data_prepared_full_tbl), "html"),
    bootstrap_options = c("striped", "hover")
)
optin_time optins_trans optins_trans_lag56 optins_trans_lag56_roll_30 optins_trans_lag56_roll_60 optins_trans_lag56_roll_90
2018-07-03 -0.4919060 NA NA NA NA
2018-07-04 -0.1534053 NA NA NA NA
2018-07-05 -0.5779424 NA NA NA NA
2018-07-06 -0.4133393 NA NA NA NA
2018-07-07 -1.2030828 NA NA NA NA
2018-07-08 -1.6633730 NA NA NA NA
# Last rows as a styled HTML table: these are future dates, so optins_trans
# is NA while the lag and rolling features are populated.
kable_styling(
    kable(tail(data_prepared_full_tbl), "html"),
    bootstrap_options = c("striped", "hover")
)
optin_time optins_trans optins_trans_lag56 optins_trans_lag56_roll_30 optins_trans_lag56_roll_60 optins_trans_lag56_roll_90
2020-04-22 NA 1.7391664 1.081005 0.7841984 0.6612021
2020-04-23 NA 2.5450484 1.117631 0.8045394 0.6618256
2020-04-24 NA 1.5884589 1.089140 0.8163169 0.6675904
2020-04-25 NA 0.4158820 1.087291 0.8376167 0.6723126
2020-04-26 NA 0.5295369 1.102533 0.8554082 0.6839993
2020-04-27 NA -0.3410451 1.093397 0.8976997 0.6936319

data_prepared_tbl and forecast_tbl

  • SEPARATE INTO MODELING & FORECAST DATA
# Last 57 rows: the final observed day followed by the 56 future days
tail(data_prepared_full_tbl, 57)
## # A tibble: 57 × 6
##    optin_time optins_trans optins_trans_lag56 optins_trans_lag56_roll_30
##    <date>            <dbl>              <dbl>                      <dbl>
##  1 2020-03-02       -0.341             1.71                        0.387
##  2 2020-03-03       NA                 1.06                        0.443
##  3 2020-03-04       NA                 2.07                        0.463
##  4 2020-03-05       NA                 1.36                        0.509
##  5 2020-03-06       NA                 0.251                       0.524
##  6 2020-03-07       NA                -0.779                       0.534
##  7 2020-03-08       NA                 0.0926                      0.556
##  8 2020-03-09       NA                 0.631                       0.540
##  9 2020-03-10       NA                 0.385                       0.521
## 10 2020-03-11       NA                 0.446                       0.511
## # ℹ 47 more rows
## # ℹ 2 more variables: optins_trans_lag56_roll_60 <dbl>,
## #   optins_trans_lag56_roll_90 <dbl>
# Modeling data (used to create train and test sets): rows where the
# target optins_trans is observed.
data_prepared_tbl <- filter(data_prepared_full_tbl, !is.na(optins_trans))
data_prepared_tbl
## # A tibble: 609 × 6
##    optin_time optins_trans optins_trans_lag56 optins_trans_lag56_roll_30
##    <date>            <dbl>              <dbl>                      <dbl>
##  1 2018-07-03      -0.492                  NA                         NA
##  2 2018-07-04      -0.153                  NA                         NA
##  3 2018-07-05      -0.578                  NA                         NA
##  4 2018-07-06      -0.413                  NA                         NA
##  5 2018-07-07      -1.20                   NA                         NA
##  6 2018-07-08      -1.66                   NA                         NA
##  7 2018-07-09      -0.274                  NA                         NA
##  8 2018-07-10      -0.212                  NA                         NA
##  9 2018-07-11      -0.0986                 NA                         NA
## 10 2018-07-12      -0.274                  NA                         NA
## # ℹ 599 more rows
## # ℹ 2 more variables: optins_trans_lag56_roll_60 <dbl>,
## #   optins_trans_lag56_roll_90 <dbl>
# Forecast data (used to make the forecast): the future rows, where the
# target optins_trans is still NA.
forecast_tbl <- filter(data_prepared_full_tbl, is.na(optins_trans))
forecast_tbl
## # A tibble: 56 × 6
##    optin_time optins_trans optins_trans_lag56 optins_trans_lag56_roll_30
##    <date>            <dbl>              <dbl>                      <dbl>
##  1 2020-03-03           NA             1.06                        0.443
##  2 2020-03-04           NA             2.07                        0.463
##  3 2020-03-05           NA             1.36                        0.509
##  4 2020-03-06           NA             0.251                       0.524
##  5 2020-03-07           NA            -0.779                       0.534
##  6 2020-03-08           NA             0.0926                      0.556
##  7 2020-03-09           NA             0.631                       0.540
##  8 2020-03-10           NA             0.385                       0.521
##  9 2020-03-11           NA             0.446                       0.511
## 10 2020-03-12           NA             0.135                       0.524
## # ℹ 46 more rows
## # ℹ 2 more variables: optins_trans_lag56_roll_60 <dbl>,
## #   optins_trans_lag56_roll_90 <dbl>

train/test

# Train/test split. The assessment window is set equal to the forecast
# horizon; cumulative = TRUE uses all of the earlier data for training.
splits <- time_series_split(
    data_prepared_tbl,
    assess     = horizon,
    cumulative = TRUE
)
## Using date_var: optin_time
# Visualize the training and testing portions of the split.
plot_time_series_cv_plan(
    tk_time_series_cv_plan(splits),
    optin_time,
    optins_trans
)

create base recipe

  • It is important to try different feature engineering sets.

  • recipe() defines the data preprocessing operations.

    • recipes are data dependent. New data must have the same column names and classes for the recipe to be applied.
    • Recipes are reusable, meaning we can use them for multiple models.
    • Recipes can be modified. We can add more steps and remove features.
  • Feature engineering is the most critical part of time series analysis.

    • Maximizing performance requires a lot of experimentation.
  • Multiple recipes

    • Can make a base recipe with most steps
    • Then create model-specific recipes that modify the base.
      • spline model in the example below will use natural splines to model trend.
      • lag model in the example below will use Lag + Rolling features to model trend.
  • Many ML models will return an error if you feed them date/time-format data.

step_timeseries_signature()

  • adds a preprocessing step to generate the time series signature features.

step_rm()

  • used to remove features that are unnecessary.
  • matches() is a tidyselect helper that allows us to use Regular Expressions (RegEx) to select column names
    • use () groups joined with | to create multi-pattern regex searches

step_normalize()

  • in recipe is equivalent to standardize_vec()
  • recall that timetk calls this process standardization (standardize_vec()), whereas recipes calls it normalization

step_range()

  • is equivalent to normalize_vec()

step_dummy()

  • performs categorical encoding for either dummy encoding or one-hot encoding.

  • Not all ML models handle categorical data in this way. So we need to perform this preprocessing to be on the safe side.

  • all_nominal() a recipe column selector that selects any columns that are categorical.

    • See also: all_numeric() and all_predictors()

step_fourier()

  • Adds fourier series features
# Base recipe shared by the model-specific recipes: predicts optins_trans
# from all other columns of the training split, then adds calendar,
# dummy, interaction and Fourier features. Step order matters: later steps
# select columns created by earlier ones.
recipe_spec_base <- recipe(optins_trans ~ ., data = training(splits)) %>%
    
    # Time series signature: expand optin_time into calendar features
    # (optin_time_year, optin_time_month, optin_time_wday.lbl, ...), then
    # drop the iso / xts / hour / minute / second / am.pm features.
    step_timeseries_signature(optin_time) %>%
    step_rm(matches("(iso)|(xts)|(hour)|(minute)|(second)|(am.pm)")) %>%
    
    # Standardization of the signature columns whose names match
    # index.num, year or yday
    step_normalize(matches("(index.num)|(year)|(yday)")) %>%
    
    # Dummy encoding of all nominal columns; one_hot = TRUE keeps one
    # indicator column per level (e.g. month.lbl_01 ... month.lbl_12)
    step_dummy(all_nominal(), one_hot = TRUE) %>%
    
    # Interaction between the week2 feature and each weekday indicator
    step_interact(~ matches("week2") * matches("wday.lbl")) %>%
    
    # Fourier features: sin/cos pairs for each period, orders K1 and K2
    step_fourier(optin_time, period = c(7, 14, 30, 90, 365), K = 2)

# Prepare the recipe and inspect the processed training data:
# juice() takes the training dataset out of the prepped recipe.
recipe_spec_base %>% prep() %>% juice() %>% glimpse()
## Rows: 553
## Columns: 68
## $ optin_time                               <date> 2018-07-03, 2018-07-04, 2018…
## $ optins_trans_lag56                       <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_30               <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_60               <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_90               <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans                             <dbl> -0.49190597, -0.15340526, -0.…
## $ optin_time_index.num                     <dbl> -1.727358, -1.721099, -1.7148…
## $ optin_time_year                          <dbl> -1.394192, -1.394192, -1.3941…
## $ optin_time_half                          <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ optin_time_quarter                       <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
## $ optin_time_month                         <int> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
## $ optin_time_day                           <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_wday                          <int> 3, 4, 5, 6, 7, 1, 2, 3, 4, 5,…
## $ optin_time_mday                          <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_qday                          <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_yday                          <dbl> -0.26427519, -0.25454717, -0.…
## $ optin_time_mweek                         <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,…
## $ optin_time_week                          <int> 27, 27, 27, 27, 27, 27, 28, 2…
## $ optin_time_week2                         <int> 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,…
## $ optin_time_week3                         <int> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,…
## $ optin_time_week4                         <int> 3, 3, 3, 3, 3, 3, 0, 0, 0, 0,…
## $ optin_time_mday7                         <int> 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,…
## $ optin_time_month.lbl_01                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_02                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_03                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_04                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_05                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_06                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_07                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ optin_time_month.lbl_08                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_09                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_10                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_11                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_12                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_1                    <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_2                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ optin_time_wday.lbl_3                    <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ optin_time_wday.lbl_4                    <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ optin_time_wday.lbl_5                    <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,…
## $ optin_time_wday.lbl_6                    <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_7                    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_3 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_4 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_5 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_6 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_7 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ optin_time_sin7_K1                       <dbl> -9.749279e-01, -7.818315e-01,…
## $ optin_time_cos7_K1                       <dbl> -0.2225209, 0.6234898, 1.0000…
## $ optin_time_sin7_K2                       <dbl> 4.338837e-01, -9.749279e-01, …
## $ optin_time_cos7_K2                       <dbl> -0.9009689, -0.2225209, 1.000…
## $ optin_time_sin14_K1                      <dbl> 7.818315e-01, 4.338837e-01, -…
## $ optin_time_cos14_K1                      <dbl> -0.6234898, -0.9009689, -1.00…
## $ optin_time_sin14_K2                      <dbl> -9.749279e-01, -7.818315e-01,…
## $ optin_time_cos14_K2                      <dbl> -0.2225209, 0.6234898, 1.0000…
## $ optin_time_sin30_K1                      <dbl> 1.126516e-13, -2.079117e-01, …
## $ optin_time_cos30_K1                      <dbl> -1.0000000, -0.9781476, -0.91…
## $ optin_time_sin30_K2                      <dbl> -2.253032e-13, 4.067366e-01, …
## $ optin_time_cos30_K2                      <dbl> 1.0000000, 0.9135455, 0.66913…
## $ optin_time_sin90_K1                      <dbl> -8.660254e-01, -8.290376e-01,…
## $ optin_time_cos90_K1                      <dbl> 0.5000000, 0.5591929, 0.61566…
## $ optin_time_sin90_K2                      <dbl> -8.660254e-01, -9.271839e-01,…
## $ optin_time_cos90_K2                      <dbl> -0.5000000, -0.3746066, -0.24…
## $ optin_time_sin365_K1                     <dbl> -0.2135209, -0.2303057, -0.24…
## $ optin_time_cos365_K1                     <dbl> -0.9769385, -0.9731183, -0.96…
## $ optin_time_sin365_K2                     <dbl> 0.4171936, 0.4482293, 0.47873…
## $ optin_time_cos365_K2                     <dbl> 0.9088176, 0.8939186, 0.87796…

recipe_spec_1

# Spline recipe: models the trend with natural splines instead of lags.
#  - Removes optin_time, which is in date format; the time information is
#    still available through the index.num feature.
#  - Adds a natural spline transformation of index.num.
#  - Removes the lag-related features. They are named optins_trans_lag56*
#    (the lag column and its rolling means), so they are selected with
#    contains("lag"); starts_with("lag_") matched none of them and left
#    the NA-filled lag columns in the recipe.
recipe_spec_1 <- recipe_spec_base %>%
    step_rm(optin_time) %>%
    # adds a step for a natural spline transformation
    step_ns(ends_with("index.num"), deg_free = 2) %>%
    step_rm(contains("lag"))

recipe_spec_2

  • Lag Recipe

step_naomit()

  • Removes rows with missing values from the columns specified.
# Train the base recipe, pull out the processed training data, and list
# every engineered column with its type and first few values.
recipe_spec_base %>%
    prep() %>%
    juice() %>%
    glimpse()
## Rows: 553
## Columns: 68
## $ optin_time                               <date> 2018-07-03, 2018-07-04, 2018…
## $ optins_trans_lag56                       <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_30               <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_60               <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_90               <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans                             <dbl> -0.49190597, -0.15340526, -0.…
## $ optin_time_index.num                     <dbl> -1.727358, -1.721099, -1.7148…
## $ optin_time_year                          <dbl> -1.394192, -1.394192, -1.3941…
## $ optin_time_half                          <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ optin_time_quarter                       <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
## $ optin_time_month                         <int> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
## $ optin_time_day                           <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_wday                          <int> 3, 4, 5, 6, 7, 1, 2, 3, 4, 5,…
## $ optin_time_mday                          <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_qday                          <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_yday                          <dbl> -0.26427519, -0.25454717, -0.…
## $ optin_time_mweek                         <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,…
## $ optin_time_week                          <int> 27, 27, 27, 27, 27, 27, 28, 2…
## $ optin_time_week2                         <int> 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,…
## $ optin_time_week3                         <int> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,…
## $ optin_time_week4                         <int> 3, 3, 3, 3, 3, 3, 0, 0, 0, 0,…
## $ optin_time_mday7                         <int> 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,…
## $ optin_time_month.lbl_01                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_02                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_03                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_04                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_05                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_06                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_07                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ optin_time_month.lbl_08                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_09                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_10                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_11                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_12                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_1                    <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_2                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ optin_time_wday.lbl_3                    <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ optin_time_wday.lbl_4                    <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ optin_time_wday.lbl_5                    <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,…
## $ optin_time_wday.lbl_6                    <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_7                    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_3 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_4 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_5 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_6 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_7 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ optin_time_sin7_K1                       <dbl> -9.749279e-01, -7.818315e-01,…
## $ optin_time_cos7_K1                       <dbl> -0.2225209, 0.6234898, 1.0000…
## $ optin_time_sin7_K2                       <dbl> 4.338837e-01, -9.749279e-01, …
## $ optin_time_cos7_K2                       <dbl> -0.9009689, -0.2225209, 1.000…
## $ optin_time_sin14_K1                      <dbl> 7.818315e-01, 4.338837e-01, -…
## $ optin_time_cos14_K1                      <dbl> -0.6234898, -0.9009689, -1.00…
## $ optin_time_sin14_K2                      <dbl> -9.749279e-01, -7.818315e-01,…
## $ optin_time_cos14_K2                      <dbl> -0.2225209, 0.6234898, 1.0000…
## $ optin_time_sin30_K1                      <dbl> 1.126516e-13, -2.079117e-01, …
## $ optin_time_cos30_K1                      <dbl> -1.0000000, -0.9781476, -0.91…
## $ optin_time_sin30_K2                      <dbl> -2.253032e-13, 4.067366e-01, …
## $ optin_time_cos30_K2                      <dbl> 1.0000000, 0.9135455, 0.66913…
## $ optin_time_sin90_K1                      <dbl> -8.660254e-01, -8.290376e-01,…
## $ optin_time_cos90_K1                      <dbl> 0.5000000, 0.5591929, 0.61566…
## $ optin_time_sin90_K2                      <dbl> -8.660254e-01, -9.271839e-01,…
## $ optin_time_cos90_K2                      <dbl> -0.5000000, -0.3746066, -0.24…
## $ optin_time_sin365_K1                     <dbl> -0.2135209, -0.2303057, -0.24…
## $ optin_time_cos365_K1                     <dbl> -0.9769385, -0.9731183, -0.96…
## $ optin_time_sin365_K2                     <dbl> 0.4171936, 0.4482293, 0.47873…
## $ optin_time_cos365_K2                     <dbl> 0.9088176, 0.8939186, 0.87796…
# Lag recipe: keeps the lag / rolling-lag features as predictors.
# - Drops the raw `optin_time` Date column.
# - Drops the rows whose lag features are missing (the leading rows of the
#   series, where no lagged value exists yet). The lag columns are named
#   `optins_trans_lag56*`, so they are selected with contains("_lag");
#   starts_with("lag_") matches no column and leaves every NA row in place.
recipe_spec_2 <- recipe_spec_base %>%
    step_rm(optin_time) %>%
    step_naomit(contains("_lag"))
    

# Train the lag recipe and inspect the processed training data to confirm
# which rows and columns survive the preprocessing.
recipe_spec_2 %>%
    prep() %>%
    juice() %>%
    glimpse()
## Rows: 553
## Columns: 67
## $ optins_trans_lag56                       <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_30               <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_60               <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans_lag56_roll_90               <dbl> NA, NA, NA, NA, NA, NA, NA, N…
## $ optins_trans                             <dbl> -0.49190597, -0.15340526, -0.…
## $ optin_time_index.num                     <dbl> -1.727358, -1.721099, -1.7148…
## $ optin_time_year                          <dbl> -1.394192, -1.394192, -1.3941…
## $ optin_time_half                          <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ optin_time_quarter                       <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
## $ optin_time_month                         <int> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,…
## $ optin_time_day                           <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_wday                          <int> 3, 4, 5, 6, 7, 1, 2, 3, 4, 5,…
## $ optin_time_mday                          <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_qday                          <int> 3, 4, 5, 6, 7, 8, 9, 10, 11, …
## $ optin_time_yday                          <dbl> -0.26427519, -0.25454717, -0.…
## $ optin_time_mweek                         <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,…
## $ optin_time_week                          <int> 27, 27, 27, 27, 27, 27, 28, 2…
## $ optin_time_week2                         <int> 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,…
## $ optin_time_week3                         <int> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,…
## $ optin_time_week4                         <int> 3, 3, 3, 3, 3, 3, 0, 0, 0, 0,…
## $ optin_time_mday7                         <int> 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,…
## $ optin_time_month.lbl_01                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_02                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_03                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_04                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_05                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_06                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_07                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ optin_time_month.lbl_08                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_09                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_10                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_11                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_month.lbl_12                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_1                    <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_2                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ optin_time_wday.lbl_3                    <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ optin_time_wday.lbl_4                    <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ optin_time_wday.lbl_5                    <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,…
## $ optin_time_wday.lbl_6                    <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ optin_time_wday.lbl_7                    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_3 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_4 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_5 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_6 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ optin_time_week2_x_optin_time_wday.lbl_7 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ optin_time_sin7_K1                       <dbl> -9.749279e-01, -7.818315e-01,…
## $ optin_time_cos7_K1                       <dbl> -0.2225209, 0.6234898, 1.0000…
## $ optin_time_sin7_K2                       <dbl> 4.338837e-01, -9.749279e-01, …
## $ optin_time_cos7_K2                       <dbl> -0.9009689, -0.2225209, 1.000…
## $ optin_time_sin14_K1                      <dbl> 7.818315e-01, 4.338837e-01, -…
## $ optin_time_cos14_K1                      <dbl> -0.6234898, -0.9009689, -1.00…
## $ optin_time_sin14_K2                      <dbl> -9.749279e-01, -7.818315e-01,…
## $ optin_time_cos14_K2                      <dbl> -0.2225209, 0.6234898, 1.0000…
## $ optin_time_sin30_K1                      <dbl> 1.126516e-13, -2.079117e-01, …
## $ optin_time_cos30_K1                      <dbl> -1.0000000, -0.9781476, -0.91…
## $ optin_time_sin30_K2                      <dbl> -2.253032e-13, 4.067366e-01, …
## $ optin_time_cos30_K2                      <dbl> 1.0000000, 0.9135455, 0.66913…
## $ optin_time_sin90_K1                      <dbl> -8.660254e-01, -8.290376e-01,…
## $ optin_time_cos90_K1                      <dbl> 0.5000000, 0.5591929, 0.61566…
## $ optin_time_sin90_K2                      <dbl> -8.660254e-01, -9.271839e-01,…
## $ optin_time_cos90_K2                      <dbl> -0.5000000, -0.3746066, -0.24…
## $ optin_time_sin365_K1                     <dbl> -0.2135209, -0.2303057, -0.24…
## $ optin_time_cos365_K1                     <dbl> -0.9769385, -0.9731183, -0.96…
## $ optin_time_sin365_K2                     <dbl> 0.4171936, 0.4482293, 0.47873…
## $ optin_time_cos365_K2                     <dbl> 0.9088176, 0.8939186, 0.87796…

Modeltime

create model

  • Don’t fit the model at this point.
# Model specification only: linear regression on the "lm" engine.
# Nothing is fitted here; fitting happens inside the workflow.
model_spec_lm <- set_engine(linear_reg(), "lm")

create workflow

  • workflow requires model and recipe objects
  • you need to fit the model within workflow before storing it into modeltime table
# Bundle the linear model with the spline recipe and fit the pair on the
# training portion of the split.
workflow_fit_lm_1_spline <- workflow() %>%
    add_model(spec = model_spec_lm) %>%
    add_recipe(recipe = recipe_spec_1) %>%
    fit(data = training(splits))

model_tbl

  • create modeltime table
# Register the fitted spline workflow in a modeltime table.
model_tbl <- modeltime_table(workflow_fit_lm_1_spline)

calibration_tbl

  • create calibration table
  • contains confidence interval
# Calibrate the registered model against the held-out test split.
calibration_tbl <- modeltime_calibrate(
    model_tbl,
    new_data = testing(splits)
)
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `.nested.col = purrr::map2(...)`.
## Caused by warning in `predict.lm()`:
## ! prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
# Test-set accuracy metrics for the calibrated model.
modeltime_accuracy(calibration_tbl)
## # A tibble: 1 × 9
##   .model_id .model_desc .type   mae  mape  mase smape  rmse   rsq
##       <int> <chr>       <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1         1 LM          Test  0.670  873. 0.907  128. 0.917 0.184
# Forecast over the test window and plot it on top of the full prepared series.
calibration_tbl %>%
    modeltime_forecast(
        new_data    = testing(splits),
        actual_data = data_prepared_tbl
    ) %>%
    plot_modeltime_forecast()
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `.nested.col = purrr::map2(...)`.
## Caused by warning in `predict.lm()`:
## ! prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
  • Following the same process, let’s create another workflow object and store it in the modeltime table
# Same linear model, this time paired with the lag recipe, fitted on the
# training portion of the split.
workflow_fit_lm_2_lag <- workflow() %>%
    add_model(spec = model_spec_lm) %>%
    add_recipe(recipe = recipe_spec_2) %>%
    fit(data = training(splits))

# Modeltime table holding both fitted workflows (spline first, lag second).
modeltime_tbl <- modeltime_table(workflow_fit_lm_1_spline, workflow_fit_lm_2_lag)
# Calibrate both models against the test split; the calibration results
# feed the accuracy table and the forecast intervals below.
calibration_tbl <- modeltime_calibrate(
    modeltime_tbl,
    new_data = testing(splits)
)
## Warning: There were 2 warnings in `dplyr::mutate()`.
## The first warning was:
## ℹ In argument: `.nested.col = purrr::map2(...)`.
## Caused by warning in `predict.lm()`:
## ! prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Compare test-set accuracy of the two models side by side.
modeltime_accuracy(calibration_tbl)
## # A tibble: 2 × 9
##   .model_id .model_desc .type   mae  mape  mase smape  rmse   rsq
##       <int> <chr>       <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1         1 LM          Test  0.670  873. 0.907  128. 0.917 0.184
## 2         2 LM          Test  0.761  875. 1.03   142. 0.999 0.245
# Plot both models' test-window forecasts against the full prepared series.
calibration_tbl %>%
    modeltime_forecast(
        new_data    = testing(splits),
        actual_data = data_prepared_tbl
    ) %>%
    plot_modeltime_forecast()
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `.nested.col = purrr::map2(...)`.
## Caused by warning in `predict.lm()`:
## ! prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `.nested.col = purrr::map2(...)`.
## Caused by warning in `predict.lm()`:
## ! prediction from rank-deficient fit; consider predict(., rankdeficient="NA")

refit

  • Retrain each of your models on the full dataset.
    • This tends to improve performance.
# Refit every calibrated model on the complete prepared dataset
# before forecasting the future window.
refit_tbl <- modeltime_refit(calibration_tbl, data = data_prepared_tbl)

Order of inversion operations

  • The order of inversion is important.
    • It’s always the reverse of the original transformation
# Forecast the future window with the refitted models, convert the values
# back to the original scale, and plot.
refit_tbl %>%
    modeltime_forecast(
        new_data    = forecast_tbl,
        actual_data = data_prepared_tbl
    ) %>%

    # Step 1 of the inversion: undo the standardization on the forecast value
    # and both interval bounds (.value through .conf_hi).
    mutate(across(
        .value:.conf_hi,
        .fns = ~ standardize_inv_vec(x = .x, mean = std_mean, sd = std_sd)
    )) %>%

    # Step 2 of the inversion: undo the log-interval transformation.
    # This must come after step 1 (inversions run in the reverse order of the
    # original transformations).
    mutate(across(
        .value:.conf_hi,
        .fns = ~ log_interval_inv_vec(
            x           = .x,
            limit_lower = limit_lower,
            limit_upper = limit_upper,
            offset      = offset
        )
    )) %>%

    plot_modeltime_forecast()
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `.nested.col = purrr::map2(...)`.
## Caused by warning in `predict.lm()`:
## ! prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
## There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `.nested.col = purrr::map2(...)`.
## Caused by warning in `predict.lm()`:
## ! prediction from rank-deficient fit; consider predict(., rankdeficient="NA")

Save the artifacts

# Gather every artifact produced in this section into one named list so it
# can be saved and reloaded as a unit.
feature_engineering_artifacts_list <- list(

    # Prepared history and the future frame used for forecasting
    data = list(
        data_prepared_tbl = data_prepared_tbl,
        forecast_tbl      = forecast_tbl
    ),

    # Preprocessing recipes: the shared base plus the two variants
    recipes = list(
        recipe_spec_base = recipe_spec_base,
        recipe_spec_1    = recipe_spec_1,
        recipe_spec_2    = recipe_spec_2
    ),

    # Fitted workflows (model + recipe)
    models = list(
        workflow_fit_lm_1_spline = workflow_fit_lm_1_spline,
        workflow_fit_lm_2_lag    = workflow_fit_lm_2_lag
    ),

    # Parameters needed to invert the standardization
    standardize = list(
        std_mean = std_mean,
        std_sd   = std_sd
    ),

    # Parameters needed to invert the log-interval transformation
    log_interval = list(
        limit_lower = limit_lower,
        limit_upper = limit_upper,
        offset      = offset
    )
)

# Print the assembled list to check its contents before saving.
feature_engineering_artifacts_list
## $data
## $data$data_prepared_tbl
## # A tibble: 609 × 6
##    optin_time optins_trans optins_trans_lag56 optins_trans_lag56_roll_30
##    <date>            <dbl>              <dbl>                      <dbl>
##  1 2018-07-03      -0.492                  NA                         NA
##  2 2018-07-04      -0.153                  NA                         NA
##  3 2018-07-05      -0.578                  NA                         NA
##  4 2018-07-06      -0.413                  NA                         NA
##  5 2018-07-07      -1.20                   NA                         NA
##  6 2018-07-08      -1.66                   NA                         NA
##  7 2018-07-09      -0.274                  NA                         NA
##  8 2018-07-10      -0.212                  NA                         NA
##  9 2018-07-11      -0.0986                 NA                         NA
## 10 2018-07-12      -0.274                  NA                         NA
## # ℹ 599 more rows
## # ℹ 2 more variables: optins_trans_lag56_roll_60 <dbl>,
## #   optins_trans_lag56_roll_90 <dbl>
## 
## $data$forecast_tbl
## # A tibble: 56 × 6
##    optin_time optins_trans optins_trans_lag56 optins_trans_lag56_roll_30
##    <date>            <dbl>              <dbl>                      <dbl>
##  1 2020-03-03           NA             1.06                        0.443
##  2 2020-03-04           NA             2.07                        0.463
##  3 2020-03-05           NA             1.36                        0.509
##  4 2020-03-06           NA             0.251                       0.524
##  5 2020-03-07           NA            -0.779                       0.534
##  6 2020-03-08           NA             0.0926                      0.556
##  7 2020-03-09           NA             0.631                       0.540
##  8 2020-03-10           NA             0.385                       0.521
##  9 2020-03-11           NA             0.446                       0.511
## 10 2020-03-12           NA             0.135                       0.524
## # ℹ 46 more rows
## # ℹ 2 more variables: optins_trans_lag56_roll_60 <dbl>,
## #   optins_trans_lag56_roll_90 <dbl>
## 
## 
## $recipes
## $recipes$recipe_spec_base
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:   1
## predictor: 5
## 
## ── Operations
## • Timeseries signature features from: optin_time
## • Variables removed: matches("(iso)|(xts)|(hour)|(minute)|(second)|(am.pm)")
## • Centering and scaling for: matches("(index.num)|(year)|(yday)")
## • Dummy variables from: all_nominal()
## • Interactions with: matches("week2") * matches("wday.lbl")
## • Fourier series features from: optin_time
## 
## $recipes$recipe_spec_1
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:   1
## predictor: 5
## 
## ── Operations
## • Timeseries signature features from: optin_time
## • Variables removed: matches("(iso)|(xts)|(hour)|(minute)|(second)|(am.pm)")
## • Centering and scaling for: matches("(index.num)|(year)|(yday)")
## • Dummy variables from: all_nominal()
## • Interactions with: matches("week2") * matches("wday.lbl")
## • Fourier series features from: optin_time
## • Variables removed: optin_time
## • Natural splines on: ends_with("index.num")
## • Variables removed: starts_with("lag_")
## 
## $recipes$recipe_spec_2
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:   1
## predictor: 5
## 
## ── Operations
## • Timeseries signature features from: optin_time
## • Variables removed: matches("(iso)|(xts)|(hour)|(minute)|(second)|(am.pm)")
## • Centering and scaling for: matches("(index.num)|(year)|(yday)")
## • Dummy variables from: all_nominal()
## • Interactions with: matches("week2") * matches("wday.lbl")
## • Fourier series features from: optin_time
## • Variables removed: optin_time
## • Removing rows with NA values in: starts_with("lag_")
## 
## 
## $models
## $models$workflow_fit_lm_1_spline
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 9 Recipe Steps
## 
## • step_timeseries_signature()
## • step_rm()
## • step_normalize()
## • step_dummy()
## • step_interact()
## • step_fourier()
## • step_rm()
## • step_ns()
## • step_rm()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:
## stats::lm(formula = ..y ~ ., data = data)
## 
## Coefficients:
##                              (Intercept)  
##                               -4.131e+02  
##                       optins_trans_lag56  
##                               -2.963e-02  
##               optins_trans_lag56_roll_30  
##                                7.936e-01  
##               optins_trans_lag56_roll_60  
##                               -1.341e+00  
##               optins_trans_lag56_roll_90  
##                               -1.182e+00  
##                          optin_time_year  
##                                1.024e+00  
##                          optin_time_half  
##                               -5.678e-02  
##                       optin_time_quarter  
##                                6.313e+01  
##                         optin_time_month  
##                                2.349e+01  
##                           optin_time_day  
##                                7.787e-01  
##                          optin_time_wday  
##                               -2.292e-01  
##                          optin_time_mday  
##                                       NA  
##                          optin_time_qday  
##                                6.941e-01  
##                          optin_time_yday  
##                               -1.563e+02  
##                         optin_time_mweek  
##                               -5.133e-02  
##                          optin_time_week  
##                                4.380e-01  
##                         optin_time_week2  
##                                3.117e-01  
##                         optin_time_week3  
##                                1.782e-02  
##                         optin_time_week4  
##                               -1.144e-02  
##                         optin_time_mday7  
##                               -2.573e-02  
##                  optin_time_month.lbl_01  
##                                1.398e+00  
##                  optin_time_month.lbl_02  
##                                2.184e+00  
##                  optin_time_month.lbl_03  
## 
## ...
## and 92 more lines.
## 
## $models$workflow_fit_lm_2_lag
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 8 Recipe Steps
## 
## • step_timeseries_signature()
## • step_rm()
## • step_normalize()
## • step_dummy()
## • step_interact()
## • step_fourier()
## • step_rm()
## • step_naomit()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:
## stats::lm(formula = ..y ~ ., data = data)
## 
## Coefficients:
##                              (Intercept)  
##                               -3.934e+02  
##                       optins_trans_lag56  
##                               -1.898e-02  
##               optins_trans_lag56_roll_30  
##                                1.189e+00  
##               optins_trans_lag56_roll_60  
##                               -1.333e+00  
##               optins_trans_lag56_roll_90  
##                                1.054e+00  
##                     optin_time_index.num  
##                               -2.355e+02  
##                          optin_time_year  
##                                2.628e+02  
##                          optin_time_half  
##                               -3.475e-02  
##                       optin_time_quarter  
##                                5.936e+01  
##                         optin_time_month  
##                                2.297e+01  
##                           optin_time_day  
##                                7.589e-01  
##                          optin_time_wday  
##                               -2.277e-01  
##                          optin_time_mday  
##                                       NA  
##                          optin_time_qday  
##                                6.545e-01  
##                          optin_time_yday  
##                                       NA  
##                         optin_time_mweek  
##                               -4.540e-02  
##                          optin_time_week  
##                                4.645e-01  
##                         optin_time_week2  
##                                2.713e-01  
##                         optin_time_week3  
##                                1.398e-02  
##                         optin_time_week4  
##                               -1.267e-02  
##                         optin_time_mday7  
##                               -2.454e-02  
##                  optin_time_month.lbl_01  
##                                1.347e+00  
##                  optin_time_month.lbl_02  
## 
## ...
## and 90 more lines.
## 
## 
## $standardize
## $standardize$std_mean
## [1] -5.25529
## 
## $standardize$std_sd
## [1] 1.110982
## 
## 
## $log_interval
## $log_interval$limit_lower
## [1] 0
## 
## $log_interval$limit_upper
## [1] 3650.8
## 
## $log_interval$offset
## [1] 1
# Persist the artifact list as an RDS file for later scripts.
# NOTE(review): this path is relative to the working directory, whereas the
# input data at the top of the file is located with here(); confirm that a
# `model/` directory exists where this is run.
write_rds(
    feature_engineering_artifacts_list,
    "model/feature_engineering_artifacts_list.rds"
)